; I use a vector type. X grows right, Y grows down, Z grows forward.
; On the FP stack and in memory it looks like {Y X Z} because sometimes I need only Y.
; The vector at [bp+si] is called 'v'.

; Program entry (.COM image loaded at 100h). The two data words below sit at
; the entry point, so they are also the first instructions executed.
org 100h ; assume al=0 bx=0 sp=di=-2 si=0100h bp=09??h; last 9 bytes of PSP = 0
C16  dw 16     ;=10 00  adc [bx+si],al   (adds al=0 to [100h] under the assumptions above)
C196 dw 0xC4   ;=C4 00  les ax,[bx+si]   C4 used as MSB of float; loads ax=[100h]=0010h, so ah=0
               ;        (es is loaded as well; it is overwritten again at M)
; RO evaluates to 0F7h here ($ is 104h). The 12 bytes F7h..102h are read as three
; dwords: two zeros from the PSP tail, then bytes 00 10 00 C4 = C4001000h = -512.25.
; That is why the last 9 bytes of the PSP (F7h..FFh) must be zero.
RO equ $-1-4*3 ; ray origin: dd 0.0, 0.0, -512.25

;Video mode, gray gradient from default palette.
  mov al,13h     ; with ah=0 from the les above: ax=0013h, mode 13h
P:int 10h      ; set video mode | set palette index: bx=i dh=R ch=G cl=B
               ; NOTE(review): label P and the palette part of this note are not used anywhere in this listing.
;  dec di       ; correct pixel address = -3
DI_ equ -2     ; value di is assumed to hold at entry; lets stack slots be addressed as [disp + di-DI_]

;Each frame:
; On entry to M: cx=T (frame counter), bx=0, si=0100h, di=DI_ (see the register note at X).
M:mov dx,0xA000-10-20-20-4;  ; =9FCAh. visible pixels are A0000..AF9FF: want X=0 Y=0 in the center
  mov es,dx    ; dx:bx=YX:XX = es:0    (dx and bx are neighbors after PUSHA)

;Generate 16 gem normals at [bp+200h,300h].
; BIG names the word formed by the last byte of `mov es,dx` and the `pusha` opcode
; below; read as a word it is the large constant used later as the ray's Z.
BIG equ $-1 ;24770
  pusha        ; outer save; with sp=-2 before it, the pushed cx (=T) lands at [-6]
  mov cx,[si]  ; cx=i=16...0   (loaded from the C16 word at 100h)
PL:
  add bp,si    ; bp=address of plane   (advances by si per plane; survives the inner popa)
  pusha        ; bx=0   ; inner save; the pushed cx (=i) lands at [-6-16]

  fld1         ;|1
  inc cx       ; done only for its parity flag; the inner popa restores cx
  jp GP       ; nice planes, z is always +-1
  fchs         ;|-1   taken when the low byte of i+1 has odd parity
GP:fild word[-6-16 + di-DI_]  ; pushed i
  fsincos      ;|cos(i) sin(i) +-1   -> filed as y x z by the STORE below

; Apply three rotations. Each pass rotates the (y,z) pair by the angle T/divisor;
; the next STORE then files the result as {Y X Z} = {rotated, rotated, old x}, so the
; axes cycle between passes. The divisor is the word at [bx+si]=[100h+bx], in execution
; order: 16 (bx=0), -15360 (bx=1, bytes 00 C4), 196 (bx=2).
RR:
  call STORE
  fld dword[bp+si+4] ;|x
  fild word[-6 + di-DI_] ;|T x   (cx pushed by the outer pusha)
  fidiv word[bx+si]  ;|T/16 (bx=0)  ;|T/-15360 (bx=1)  ;|T/196 (bx=2)
ROTATE:
  fsincos            ;|c s x
  call LOAD_SCALE_YZ ;|sy c sz x
  call LOAD_SCALE_YZ ;|cy sy cz sz x
  fsubp st3,st0      ;|sy cz sz-cy x
  faddp              ;|sy+cz sz-cy x    ;| Y X Z = R(y,z),x
  inc bx
  jpo RR             ;|loop 3 times   (bl=1 and bl=2 have odd parity; bl=3 is even and exits)
  call STORE  ; store unnormalized normal (it's scaled by sqrt2)

  popa         ; restores cx=i and bx=0; bp keeps the value produced by `add bp,si`
  loop PL
  popa         ; restores bp, cx=T and the other registers saved after M

;Each pixel:
X:   ;cx=T dx:bx=YX:XX(init=9fca:0000) di=adr_pixel(init=0) bp=09?? si=0100 ah=0
  inc dx        ; carry from bx into dx: reached at frame start and whenever `add bx,0xCCCD` carries
X2:
; fninit discards whatever the previous pixel left on the FPU stack.
; After pusha, di=0 makes [di-N] address the saved registers at the top of the segment.
  fninit        ; adr:     -18 -16 -14 -12 -10  -8  -6  -4  -2
  pusha         ; stack:    di  si  bp  sp  bx  dx  cx  ax   0
  xor di,di     ; s16:  pixadr 100 9??  -2  ..X..Y  T result

;Compute ray direction.
  fild word[byte BIG + si-100h] ; Z=24770 (the word at BIG); si-relative so the displacement fits in a byte
  fild word[di-9]  ; X   |rD.xyz p.d   (word spanning the high byte of pushed bx and the low byte of pushed dx)
  fild word[di-8]  ; Y                 (pushed dx)

NORMALIZE_STORE: ; { a.y .x .z } --> { n.y .x .z } a[bp](unnormalized) bx=bp
  call STORE     ;|a*a   ; [bp]=a (unnormalized); the FPU stack is empty afterwards
  mov bx,bp      ; make DOT compute a dot a
  call DOT       ;|a*a
  fsqrt          ;|sqrt(a*a)
  fld1           ;|1 sqrt(a*a)
  fdivrp st1   ;|rsqrt(a*a)        ...    will be: |rd.x rd.y rd.z
  call LOAD_SCALE ;|k*a.y k*a.x k*a.z   with k=rsqrt(a*a)
  call STORE      ; [bp+si] = rd, the normalized ray direction

;Hit the gem.
; Intersects the ray with each of the 16 planes, keeping the largest entry distance tf
; and the smallest exit distance tb. The ray hits when tf<tb after the last plane.
GEM_OUTER: ; rd[bp] --> cf=1_if_hit dx=address_of_hit_facet   ; clobbers ax,bx,cx,dx
  fild dword[si]  ;|tb=huge   (the dword at 100h, 00C40010h, read as an integer)
  fldz            ;|tf=0 tb
  mov cx,[si]     ; cx=i=16...1
  lea bx,[bp+si]  ; bx = current gem; gem normals are at [bp+200h,300h,...]
G:
  fild word[byte C196 + si-100h] ; planes have distance 196   |pd tf tb

;ray-plane intersection
  call DOT        ;|D=pn*rd pd tf tb
  push bp
  mov bp,RO-0x100 ; so that [bp+si] addresses the ray origin RO (si=100h)
  call DOT        ;|pn*ro D pd tf tb
  pop bp
  fsubp st2,st0 ;|D N=pd-pn*ro
  ftst          ; compare D with 0
  fnstsw ax
  sahf       ; cf=1 if we're in front of the plane   (i.e. D<0)
  fdivp st1,st0 ;|t=N/D tf tb   (x87 arithmetic leaves the CPU flags alone)
  jnc GBACK
GFRONT:
  fcom st0,st1  ; t vs tf
  fnstsw ax
  sahf
  jbe GNEXT     ;if t>tf { tf=t; dx=hit_address = current; }
  fst st1
  mov dx,bx
  jmp GNEXT
GBACK:
  fcom st0,st2  ; t vs tb
  fnstsw ax
  sahf
  jae GNEXT     ;if t<tb { tb=t; }
  fst st2
GNEXT:
  fstp st0      ;|tf tb   (drop t)
  fcom          ; tf vs tb
  fnstsw ax
  sahf          ;if tf>=tb { no_hit: cf=0; early exit } else { cf=1 }
  jae GEXIT
  lea bx,[bx+si]; don't set flags   (advance to the next plane; cf must survive to GEXIT)
  loop G        ; loop does not modify flags either
GEXIT:          ;dx=adr_facet (meaningful when cf=1), bp=rd
  fld dword[bp+si] ;|rd.y tf tb   (fld leaves cf untouched)
  jnc SKY          ; no hit: shade from rd.y alone
  mov bx,dx
  call DOT       ;|rd*best.n    ;i[bx] n[bp]
  fmul dword[bx+si] ;|(rd*n)*n.y
  fsubr dword[bp+si] ;|rd.y-(rd*n)*n.y   ; presumably the reflected ray's y, given the normals are scaled by sqrt2
SKY:
  fmul st0         ;|rd.y^2 and skip gamma correction (so actually ^4)
  fimul word[byte C196 + si-100h]   ; *196 (the C196 word)
  fistp word[di-4] ; pushed ax   (di=0, so this is the ax slot of the per-pixel pusha)
  popa             ; ax = brightness just stored; all other registers restored

; 4-bit builtin gray palette with cheapo (6$) dithering. (The multiplier must be 239, not 255.)
; NOTE(review): the dithering instructions are commented out below; only the plain
; 4-bit quantisation is active.
;  add al,16

  shr al,4      ; keep the top 4 bits of the brightness
;  cmp bl,al
;  adc al,0
;  and al,0Fh
  add al,16     ; shift into palette indices 16 and up
  
  stosb         ; write the pixel at es:di and advance di
  add bx,0xCCCD ;dx:bx = YXX += 0000CCCD
  jnc X2        ; no carry: next pixel, dx unchanged
  jnz X   ;do 65536 iterations   (carry with bx!=0: go through X to increment dx; bx==0 ends the frame)

  inc cx  ; T++
  in al,60h     ; read the keyboard data port
  dec ax        ; zero only when ax was 1 (relies on ah=0, see the note at X)
  jnz M   ; fallthrough
          ; On exit execution runs on into STORE below and leaves through its ret,
          ; which pops the word on top of the stack (shown as 0 in the stack map at X2).

;-----------------------------------------------------------------------
; STORE - pop the vector held on the FPU stack into memory at [bp+si].
;   In:    st0 = a.y, st1 = a.x, st2 = a.z
;   Out:   [bp+si] = a.y, [bp+si+4] = a.x, [bp+si+8] = a.z  (32-bit floats)
;          the three values are removed from the FPU stack
;   Note:  the main loop also falls through into this routine when it exits.
;-----------------------------------------------------------------------
STORE:
        fstp    dword [bp+si]           ; y
        fstp    dword [bp+si+4]         ; x
        fstp    dword [bp+si+8]         ; z
        ret

;-----------------------------------------------------------------------
; LOAD_SCALE - push the vector stored at [bp+si], multiplied by st0.
;   In:    st0 = k
;   Out:   st0 = k*y, st1 = k*x, st2 = k*z   (k itself is replaced)
;
; LOAD_SCALE_YZ - second entry point that handles only z and y.
;   In:    st0 = p (passed through), st1 = k
;   Out:   st0 = k*y, st1 = p, st2 = k*z
;-----------------------------------------------------------------------
LOAD_SCALE:
        fld     dword [bp+si+4]         ;|x k
        fmul    st0, st1                ;|kx k
LOAD_SCALE_YZ:
        fld     dword [bp+si+8]         ;|z p k
        fmul    st0, st2                ;|kz p k
        fxch    st2                     ;|k p kz
        fmul    dword [bp+si]           ;|ky p kz
        ret

;-----------------------------------------------------------------------
; DOT - push the dot product of two stored vectors onto the FPU stack.
;   In:    [bp+si] = vector a, [bx+si] = vector b  (three 32-bit floats each)
;   Out:   st0 = a.y*b.y + a.x*b.x + a.z*b.z; previous stack contents move down one
;   Uses:  one extra FPU stack slot while accumulating
;-----------------------------------------------------------------------
DOT:
        fld     dword [bp+si]           ;|a.y
        fmul    dword [bx+si]           ;|a.y*b.y
        fld     dword [bp+si+4]         ;|a.x  yy
        fmul    dword [bx+si+4]         ;|a.x*b.x  yy
        faddp   st1, st0                ;|yy+xx
        fld     dword [bp+si+8]         ;|a.z  yy+xx
        fmul    dword [bx+si+8]         ;|a.z*b.z  yy+xx
        faddp   st1, st0                ;|yy+xx+zz
        ret
